{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark import SparkContext\n",
    "from pyspark.sql import SparkSession\n",
    "from pyspark.sql.functions import *\n",
    "from pyspark.sql import Row\n",
    "from pyspark.sql import Column"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "sc=SparkContext(appName=\"csv_dataframe\") \n",
    "spark=SparkSession.builder.appName('csv_dataframe').master(\"Yct201811021847\").getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_df_from_csv():\n",
    "    df=spark.read.csv('iris_dataset.csv',header=True, inferSchema=True)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "csv_df=create_df_from_csv()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- sepal_length: double (nullable = true)\n",
      " |-- sepal_width: double (nullable = true)\n",
      " |-- petal_length: double (nullable = true)\n",
      " |-- petal_width: double (nullable = true)\n",
      " |-- species: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "csv_df.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+------------------+-------------------+------------------+------------------+---------+\n",
      "|summary|      sepal_length|        sepal_width|      petal_length|       petal_width|  species|\n",
      "+-------+------------------+-------------------+------------------+------------------+---------+\n",
      "|  count|               150|                150|               150|               150|      150|\n",
      "|   mean| 5.843333333333335| 3.0540000000000007|3.7586666666666693|1.1986666666666672|     null|\n",
      "| stddev|0.8280661279778637|0.43359431136217375| 1.764420419952262|0.7631607417008414|     null|\n",
      "|    min|               4.3|                2.0|               1.0|               0.1|   setosa|\n",
      "|    25%|               5.1|                2.8|               1.6|               0.3|     null|\n",
      "|    50%|               5.8|                3.0|               4.3|               1.3|     null|\n",
      "|    75%|               6.4|                3.3|               5.1|               1.8|     null|\n",
      "|    max|               7.9|                4.4|               6.9|               2.5|virginica|\n",
      "+-------+------------------+-------------------+------------------+------------------+---------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "csv_df.summary().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "arr_csv=csv_df.select(array(csv_df.sepal_length,csv_df.sepal_width,csv_df.petal_length,csv_df.petal_width).alias('arr_data'),csv_df.species)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataFrame[array_max(arr_data): double, array_min(arr_data): double]"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "arr_csv.select(array_max(arr_csv.arr_data),array_min(arr_csv.arr_data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- arr_data: array (nullable = false)\n",
      " |    |-- element: double (containsNull = true)\n",
      " |-- species: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "arr_csv.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "rdd_csv=arr_csv.rdd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(arr_data=[5.1, 3.5, 1.4, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[4.9, 3.0, 1.4, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[4.7, 3.2, 1.3, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[4.6, 3.1, 1.5, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[5.0, 3.6, 1.4, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[5.4, 3.9, 1.7, 0.4], species=u'setosa'),\n",
       " Row(arr_data=[4.6, 3.4, 1.4, 0.3], species=u'setosa'),\n",
       " Row(arr_data=[5.0, 3.4, 1.5, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[4.4, 2.9, 1.4, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[4.9, 3.1, 1.5, 0.1], species=u'setosa'),\n",
       " Row(arr_data=[5.4, 3.7, 1.5, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[4.8, 3.4, 1.6, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[4.8, 3.0, 1.4, 0.1], species=u'setosa'),\n",
       " Row(arr_data=[4.3, 3.0, 1.1, 0.1], species=u'setosa'),\n",
       " Row(arr_data=[5.8, 4.0, 1.2, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[5.7, 4.4, 1.5, 0.4], species=u'setosa'),\n",
       " Row(arr_data=[5.4, 3.9, 1.3, 0.4], species=u'setosa'),\n",
       " Row(arr_data=[5.1, 3.5, 1.4, 0.3], species=u'setosa'),\n",
       " Row(arr_data=[5.7, 3.8, 1.7, 0.3], species=u'setosa'),\n",
       " Row(arr_data=[5.1, 3.8, 1.5, 0.3], species=u'setosa'),\n",
       " Row(arr_data=[5.4, 3.4, 1.7, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[5.1, 3.7, 1.5, 0.4], species=u'setosa'),\n",
       " Row(arr_data=[4.6, 3.6, 1.0, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[5.1, 3.3, 1.7, 0.5], species=u'setosa'),\n",
       " Row(arr_data=[4.8, 3.4, 1.9, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[5.0, 3.0, 1.6, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[5.0, 3.4, 1.6, 0.4], species=u'setosa'),\n",
       " Row(arr_data=[5.2, 3.5, 1.5, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[5.2, 3.4, 1.4, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[4.7, 3.2, 1.6, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[4.8, 3.1, 1.6, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[5.4, 3.4, 1.5, 0.4], species=u'setosa'),\n",
       " Row(arr_data=[5.2, 4.1, 1.5, 0.1], species=u'setosa'),\n",
       " Row(arr_data=[5.5, 4.2, 1.4, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[4.9, 3.1, 1.5, 0.1], species=u'setosa'),\n",
       " Row(arr_data=[5.0, 3.2, 1.2, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[5.5, 3.5, 1.3, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[4.9, 3.1, 1.5, 0.1], species=u'setosa'),\n",
       " Row(arr_data=[4.4, 3.0, 1.3, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[5.1, 3.4, 1.5, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[5.0, 3.5, 1.3, 0.3], species=u'setosa'),\n",
       " Row(arr_data=[4.5, 2.3, 1.3, 0.3], species=u'setosa'),\n",
       " Row(arr_data=[4.4, 3.2, 1.3, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[5.0, 3.5, 1.6, 0.6], species=u'setosa'),\n",
       " Row(arr_data=[5.1, 3.8, 1.9, 0.4], species=u'setosa'),\n",
       " Row(arr_data=[4.8, 3.0, 1.4, 0.3], species=u'setosa'),\n",
       " Row(arr_data=[5.1, 3.8, 1.6, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[4.6, 3.2, 1.4, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[5.3, 3.7, 1.5, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[5.0, 3.3, 1.4, 0.2], species=u'setosa'),\n",
       " Row(arr_data=[7.0, 3.2, 4.7, 1.4], species=u'versicolor'),\n",
       " Row(arr_data=[6.4, 3.2, 4.5, 1.5], species=u'versicolor'),\n",
       " Row(arr_data=[6.9, 3.1, 4.9, 1.5], species=u'versicolor'),\n",
       " Row(arr_data=[5.5, 2.3, 4.0, 1.3], species=u'versicolor'),\n",
       " Row(arr_data=[6.5, 2.8, 4.6, 1.5], species=u'versicolor'),\n",
       " Row(arr_data=[5.7, 2.8, 4.5, 1.3], species=u'versicolor'),\n",
       " Row(arr_data=[6.3, 3.3, 4.7, 1.6], species=u'versicolor'),\n",
       " Row(arr_data=[4.9, 2.4, 3.3, 1.0], species=u'versicolor'),\n",
       " Row(arr_data=[6.6, 2.9, 4.6, 1.3], species=u'versicolor'),\n",
       " Row(arr_data=[5.2, 2.7, 3.9, 1.4], species=u'versicolor'),\n",
       " Row(arr_data=[5.0, 2.0, 3.5, 1.0], species=u'versicolor'),\n",
       " Row(arr_data=[5.9, 3.0, 4.2, 1.5], species=u'versicolor'),\n",
       " Row(arr_data=[6.0, 2.2, 4.0, 1.0], species=u'versicolor'),\n",
       " Row(arr_data=[6.1, 2.9, 4.7, 1.4], species=u'versicolor'),\n",
       " Row(arr_data=[5.6, 2.9, 3.6, 1.3], species=u'versicolor'),\n",
       " Row(arr_data=[6.7, 3.1, 4.4, 1.4], species=u'versicolor'),\n",
       " Row(arr_data=[5.6, 3.0, 4.5, 1.5], species=u'versicolor'),\n",
       " Row(arr_data=[5.8, 2.7, 4.1, 1.0], species=u'versicolor'),\n",
       " Row(arr_data=[6.2, 2.2, 4.5, 1.5], species=u'versicolor'),\n",
       " Row(arr_data=[5.6, 2.5, 3.9, 1.1], species=u'versicolor'),\n",
       " Row(arr_data=[5.9, 3.2, 4.8, 1.8], species=u'versicolor'),\n",
       " Row(arr_data=[6.1, 2.8, 4.0, 1.3], species=u'versicolor'),\n",
       " Row(arr_data=[6.3, 2.5, 4.9, 1.5], species=u'versicolor'),\n",
       " Row(arr_data=[6.1, 2.8, 4.7, 1.2], species=u'versicolor'),\n",
       " Row(arr_data=[6.4, 2.9, 4.3, 1.3], species=u'versicolor'),\n",
       " Row(arr_data=[6.6, 3.0, 4.4, 1.4], species=u'versicolor'),\n",
       " Row(arr_data=[6.8, 2.8, 4.8, 1.4], species=u'versicolor'),\n",
       " Row(arr_data=[6.7, 3.0, 5.0, 1.7], species=u'versicolor'),\n",
       " Row(arr_data=[6.0, 2.9, 4.5, 1.5], species=u'versicolor'),\n",
       " Row(arr_data=[5.7, 2.6, 3.5, 1.0], species=u'versicolor'),\n",
       " Row(arr_data=[5.5, 2.4, 3.8, 1.1], species=u'versicolor'),\n",
       " Row(arr_data=[5.5, 2.4, 3.7, 1.0], species=u'versicolor'),\n",
       " Row(arr_data=[5.8, 2.7, 3.9, 1.2], species=u'versicolor'),\n",
       " Row(arr_data=[6.0, 2.7, 5.1, 1.6], species=u'versicolor'),\n",
       " Row(arr_data=[5.4, 3.0, 4.5, 1.5], species=u'versicolor'),\n",
       " Row(arr_data=[6.0, 3.4, 4.5, 1.6], species=u'versicolor'),\n",
       " Row(arr_data=[6.7, 3.1, 4.7, 1.5], species=u'versicolor'),\n",
       " Row(arr_data=[6.3, 2.3, 4.4, 1.3], species=u'versicolor'),\n",
       " Row(arr_data=[5.6, 3.0, 4.1, 1.3], species=u'versicolor'),\n",
       " Row(arr_data=[5.5, 2.5, 4.0, 1.3], species=u'versicolor'),\n",
       " Row(arr_data=[5.5, 2.6, 4.4, 1.2], species=u'versicolor'),\n",
       " Row(arr_data=[6.1, 3.0, 4.6, 1.4], species=u'versicolor'),\n",
       " Row(arr_data=[5.8, 2.6, 4.0, 1.2], species=u'versicolor'),\n",
       " Row(arr_data=[5.0, 2.3, 3.3, 1.0], species=u'versicolor'),\n",
       " Row(arr_data=[5.6, 2.7, 4.2, 1.3], species=u'versicolor'),\n",
       " Row(arr_data=[5.7, 3.0, 4.2, 1.2], species=u'versicolor'),\n",
       " Row(arr_data=[5.7, 2.9, 4.2, 1.3], species=u'versicolor'),\n",
       " Row(arr_data=[6.2, 2.9, 4.3, 1.3], species=u'versicolor'),\n",
       " Row(arr_data=[5.1, 2.5, 3.0, 1.1], species=u'versicolor'),\n",
       " Row(arr_data=[5.7, 2.8, 4.1, 1.3], species=u'versicolor'),\n",
       " Row(arr_data=[6.3, 3.3, 6.0, 2.5], species=u'virginica'),\n",
       " Row(arr_data=[5.8, 2.7, 5.1, 1.9], species=u'virginica'),\n",
       " Row(arr_data=[7.1, 3.0, 5.9, 2.1], species=u'virginica'),\n",
       " Row(arr_data=[6.3, 2.9, 5.6, 1.8], species=u'virginica'),\n",
       " Row(arr_data=[6.5, 3.0, 5.8, 2.2], species=u'virginica'),\n",
       " Row(arr_data=[7.6, 3.0, 6.6, 2.1], species=u'virginica'),\n",
       " Row(arr_data=[4.9, 2.5, 4.5, 1.7], species=u'virginica'),\n",
       " Row(arr_data=[7.3, 2.9, 6.3, 1.8], species=u'virginica'),\n",
       " Row(arr_data=[6.7, 2.5, 5.8, 1.8], species=u'virginica'),\n",
       " Row(arr_data=[7.2, 3.6, 6.1, 2.5], species=u'virginica'),\n",
       " Row(arr_data=[6.5, 3.2, 5.1, 2.0], species=u'virginica'),\n",
       " Row(arr_data=[6.4, 2.7, 5.3, 1.9], species=u'virginica'),\n",
       " Row(arr_data=[6.8, 3.0, 5.5, 2.1], species=u'virginica'),\n",
       " Row(arr_data=[5.7, 2.5, 5.0, 2.0], species=u'virginica'),\n",
       " Row(arr_data=[5.8, 2.8, 5.1, 2.4], species=u'virginica'),\n",
       " Row(arr_data=[6.4, 3.2, 5.3, 2.3], species=u'virginica'),\n",
       " Row(arr_data=[6.5, 3.0, 5.5, 1.8], species=u'virginica'),\n",
       " Row(arr_data=[7.7, 3.8, 6.7, 2.2], species=u'virginica'),\n",
       " Row(arr_data=[7.7, 2.6, 6.9, 2.3], species=u'virginica'),\n",
       " Row(arr_data=[6.0, 2.2, 5.0, 1.5], species=u'virginica'),\n",
       " Row(arr_data=[6.9, 3.2, 5.7, 2.3], species=u'virginica'),\n",
       " Row(arr_data=[5.6, 2.8, 4.9, 2.0], species=u'virginica'),\n",
       " Row(arr_data=[7.7, 2.8, 6.7, 2.0], species=u'virginica'),\n",
       " Row(arr_data=[6.3, 2.7, 4.9, 1.8], species=u'virginica'),\n",
       " Row(arr_data=[6.7, 3.3, 5.7, 2.1], species=u'virginica'),\n",
       " Row(arr_data=[7.2, 3.2, 6.0, 1.8], species=u'virginica'),\n",
       " Row(arr_data=[6.2, 2.8, 4.8, 1.8], species=u'virginica'),\n",
       " Row(arr_data=[6.1, 3.0, 4.9, 1.8], species=u'virginica'),\n",
       " Row(arr_data=[6.4, 2.8, 5.6, 2.1], species=u'virginica'),\n",
       " Row(arr_data=[7.2, 3.0, 5.8, 1.6], species=u'virginica'),\n",
       " Row(arr_data=[7.4, 2.8, 6.1, 1.9], species=u'virginica'),\n",
       " Row(arr_data=[7.9, 3.8, 6.4, 2.0], species=u'virginica'),\n",
       " Row(arr_data=[6.4, 2.8, 5.6, 2.2], species=u'virginica'),\n",
       " Row(arr_data=[6.3, 2.8, 5.1, 1.5], species=u'virginica'),\n",
       " Row(arr_data=[6.1, 2.6, 5.6, 1.4], species=u'virginica'),\n",
       " Row(arr_data=[7.7, 3.0, 6.1, 2.3], species=u'virginica'),\n",
       " Row(arr_data=[6.3, 3.4, 5.6, 2.4], species=u'virginica'),\n",
       " Row(arr_data=[6.4, 3.1, 5.5, 1.8], species=u'virginica'),\n",
       " Row(arr_data=[6.0, 3.0, 4.8, 1.8], species=u'virginica'),\n",
       " Row(arr_data=[6.9, 3.1, 5.4, 2.1], species=u'virginica'),\n",
       " Row(arr_data=[6.7, 3.1, 5.6, 2.4], species=u'virginica'),\n",
       " Row(arr_data=[6.9, 3.1, 5.1, 2.3], species=u'virginica'),\n",
       " Row(arr_data=[5.8, 2.7, 5.1, 1.9], species=u'virginica'),\n",
       " Row(arr_data=[6.8, 3.2, 5.9, 2.3], species=u'virginica'),\n",
       " Row(arr_data=[6.7, 3.3, 5.7, 2.5], species=u'virginica'),\n",
       " Row(arr_data=[6.7, 3.0, 5.2, 2.3], species=u'virginica'),\n",
       " Row(arr_data=[6.3, 2.5, 5.0, 1.9], species=u'virginica'),\n",
       " Row(arr_data=[6.5, 3.0, 5.2, 2.0], species=u'virginica'),\n",
       " Row(arr_data=[6.2, 3.4, 5.4, 2.3], species=u'virginica'),\n",
       " Row(arr_data=[5.9, 3.0, 5.1, 1.8], species=u'virginica')]"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rdd_csv.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "gp_csv=csv_df.groupBy(['species'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "g1=gp_csv.avg('sepal_length','sepal_width','petal_length','petal_width')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "g2=gp_csv.mean('sepal_length','sepal_width','petal_length','petal_width')\n",
    "g3=gp_csv.max('sepal_length','sepal_width','petal_length','petal_width')\n",
    "g4=gp_csv.min('sepal_length','sepal_width','petal_length','petal_width')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "g_all=g1.union(g2).union(g3).union(g4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- species: string (nullable = true)\n",
      " |-- avg(sepal_length): double (nullable = true)\n",
      " |-- avg(sepal_width): double (nullable = true)\n",
      " |-- avg(petal_length): double (nullable = true)\n",
      " |-- avg(petal_width): double (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "g1.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(species=u'virginica', avg(sepal_length)=6.587999999999998, avg(sepal_width)=2.9739999999999998, avg(petal_length)=5.552, avg(petal_width)=2.026),\n",
       " Row(species=u'versicolor', avg(sepal_length)=5.936, avg(sepal_width)=2.7700000000000005, avg(petal_length)=4.26, avg(petal_width)=1.3259999999999998),\n",
       " Row(species=u'setosa', avg(sepal_length)=5.005999999999999, avg(sepal_width)=3.4180000000000006, avg(petal_length)=1.464, avg(petal_width)=0.2439999999999999)]"
      ]
     },
     "execution_count": 89,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "g1.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
